# ==============================================================================
# file name: 02-recoding-traces.R
# authors: Bernhard Clemm 
# date: February 18, 2022
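# description: recodes the trace data and joins it to the survey data; run
#              after the script that writes data/processed/survey_all.csv
#              (TODO confirm its file name; this line previously named
#              02-recoding-traces.R, i.e. this script itself)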
# ==============================================================================

# SETUP ========================================================================

# Project root: two directory levels above this script's location, taken from
# the path reported by the RStudio source editor (so an RStudio session is
# needed for this to resolve)
basedir <- paste0(
  dirname(dirname(rstudioapi::getSourceEditorContext()$path)),
  "/"
)
# Data files read and written below live under <root>/data/
datadir <- paste0(basedir, "data/")

library(tidyverse)

# DATA =========================================================================

# Raw trace data: an export from the table described in "queries-traces.sql".
# Rows with an empty timespan are visits outside the two periods and are
# dropped right away.
traces00 <- paste0(datadir, "traces/pl_us_traces.csv") %>%
  read.csv() %>%
  filter(timespan != "")

# Bring into wide format: one set of "_before" / "_during" columns per measure
traces <- traces00 %>%
  # Drop the pre-period that does not matter for each country:
  # month before W2 for US, month before W3 for PL.
  # %in% (rather than ==) never yields NA, so rows with a missing value are
  # kept, exactly as a non-matching case_when() branch would keep them.
  filter(
    !(timespan %in% "month_before_w2" & country %in% "US"),
    !(timespan %in% "month_before_w3" & country %in% "PL")
  ) %>%
  # Collapse the four period labels into "before" / "during";
  # any other label becomes NA
  mutate(
    timespan = case_when(
      timespan %in% c("month_before_w2", "month_before_w3") ~ "before",
      timespan %in% c("during_no_news", "during_more_news") ~ "during",
      TRUE ~ NA_character_
    )
  ) %>%
  # Sorting fixes the row order and the order in which the new columns appear
  arrange(person_id, timespan) %>%
  pivot_wider(
    names_from = "timespan",
    values_from = c(
      "u_visits",
      "active_days",
      "news_visits",
      "news_ideo",
      "news_ideo_alt",
      "pol_visits",
      "pol_news_visits"
    )
  )

# Join surveys

survey_all <- read.csv(paste0(datadir, "processed/survey_all.csv"))

# Keep every survey row and attach the wide trace columns by respondent.
# country is removed from the traces first; presumably the survey data already
# carries it (it is used again further down) -- TODO confirm.
data_wide00 <- survey_all %>%
  left_join(select(traces, -country), by = "person_id")

# PRIOR EXPOSURE ===============================================================

# PAP: "...we calculate respondents' prior level of news consumption by calculating
# the average number of times a day respondents accessed unique URLs from news
# domains the month before completing the pre-survey (we will use fewer days to 
# calculate this average if respondents provided browsing data for less than 30 days)"

# PAP: "...the domain-level ideology scores will be attached to all the visits/URLs 
# to/from that domain, and then we will use the resulting scores to calculate 
# the final individual-level average."

# Additional decision: If 0 active days, then prior exposure gets 0 

data_wide01 <- data_wide00 %>%
  # Prior exposure: visits per active day in the "before" period.
  # Respondents with 0 active days get 0; a missing number of active days
  # stays missing.
  mutate(
    news_before_mean = ifelse(
      active_days_before == 0, 0, news_visits_before / active_days_before),
    pol_before_mean = ifelse(
      active_days_before == 0, 0, pol_visits_before / active_days_before),
    pol_news_before_mean = ifelse(
      active_days_before == 0, 0, pol_news_visits_before / active_days_before)
  ) %>%
  # Ideological congeniality of prior news exposure: standardise the ideology
  # score across all rows, then reverse its sign for "left" respondents.
  # ifelse() also turns the one-column matrix returned by scale() back into a
  # plain vector.
  mutate(
    news_like = scale(news_ideo_before),
    news_like = ifelse(leftright == "left", -news_like, news_like)
  )
  
# COMPLIANCE ===================================================================

# PAP: "...we will create an individual-level variable measuring compliance: 
# the percentage increase in the average number of unique news URLs accessed 
# before the wave 2 survey, and between the completion of the pre- and post-surveys."

# Additional decision: If 0 active days during the period, then the
# during-period means get 0
  
data_wide02 <- data_wide01 %>%
  # Visits per active day in the "during" period; 0 active days gives 0
  mutate(
    # main measure: news visits
    news_during_mean = ifelse(
      active_days_during == 0, 0, news_visits_during / active_days_during),
    # alternative compliance measures: political and political-news visits
    pol_during_mean = ifelse(
      active_days_during == 0, 0, pol_visits_during / active_days_during),
    pol_news_during_mean = ifelse(
      active_days_during == 0, 0, pol_news_visits_during / active_days_during)
  )

data_wide <- data_wide02 %>%
  mutate(
    # Continuous absolute change: during-period mean minus before-period mean
    news_change_meanabs = news_during_mean - news_before_mean,
    pol_change_meanabs = pol_during_mean - pol_before_mean,
    pol_news_change_meanabs = pol_news_during_mean - pol_news_before_mean,
    # Compliance: the change itself, with the sign reversed for US respondents
    # (presumably because a decrease counts as compliance there -- TODO confirm)
    compl_meanabs = ifelse(
      country == "US", -news_change_meanabs, news_change_meanabs),
    pol_compl_meanabs = ifelse(
      country == "US", -pol_change_meanabs, pol_change_meanabs),
    pol_news_compl_meanabs = ifelse(
      country == "US", -pol_news_change_meanabs, pol_news_change_meanabs)
  )

# EXPORT =======================================================================

# Write the final wide data set. row.names = FALSE keeps R's row index out of
# the CSV; the literal FALSE is used because F is an ordinary variable that
# can be reassigned.
write.csv(data_wide, paste0(datadir, "processed/data_wide.csv"),
          row.names = FALSE)

